In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
In [2]:
import pandas as pd
# Absolute location of the active-sellers CSV export on the author's machine.
csv_file_path = r"C:\Users\User\Desktop\Data Analytics\activesellers.csv"
# Parse the CSV into the DataFrame that the following cells read as `df`.
df = pd.read_csv(filepath_or_buffer=csv_file_path)
In [3]:
# Quick look at the data: first rows, then schema, then summary statistics.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() appends a stray "None" line.
df.info()
print(df.describe())
merchantid listedproducts totalunitssold \
0 5357bcf2bb72c5504882e889 2 120000
1 5708773c3c02161b3f8c7900 5 107100
2 5417aada4ad3ab27e954b76c 2 100007
3 570f3a713a698c14278bb51e 1 100000
4 53082ea15aefb07dfe1f2a4f 1 100000
meanunitssoldperproduct rating merchantratingscount meanproductprices \
0 60000.0 4.219 320031.0 9.00
1 21420.0 3.934 139223.0 7.76
2 50004.0 4.053 108048.0 8.00
3 100000.0 3.889 19248.0 5.67
4 100000.0 4.036 366898.0 5.00
meanretailprices averagediscount meandiscount meanproductratingscount \
0 20.0 54.0 54.0 8836.0
1 34.2 61.0 61.0 4010.0
2 8.0 -1.0 -1.0 5531.0
3 19.0 71.0 71.0 18393.0
4 33.0 85.0 85.0 13789.0
totalurgencycount urgencytextrate
0 1.0 50.0
1 3.0 60.0
2 NaN NaN
3 1.0 100.0
4 NaN NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 merchantid 958 non-null object
1 listedproducts 958 non-null int64
2 totalunitssold 958 non-null int64
3 meanunitssoldperproduct 958 non-null float64
4 rating 958 non-null float64
5 merchantratingscount 958 non-null float64
6 meanproductprices 958 non-null float64
7 meanretailprices 958 non-null float64
8 averagediscount 958 non-null float64
9 meandiscount 958 non-null float64
10 meanproductratingscount 958 non-null float64
11 totalurgencycount 391 non-null float64
12 urgencytextrate 391 non-null float64
dtypes: float64(10), int64(2), object(1)
memory usage: 97.4+ KB
None
listedproducts totalunitssold meanunitssoldperproduct rating \
count 958.000000 958.000000 958.000000 958.000000
mean 1.641962 7124.483299 4407.605428 4.043994
std 1.246183 14363.521893 9167.077812 0.222137
min 1.000000 1.000000 1.000000 2.333000
25% 1.000000 100.000000 100.000000 3.931000
50% 1.000000 1000.000000 1000.000000 4.055500
75% 2.000000 10000.000000 5000.000000 4.190000
max 15.000000 120000.000000 100000.000000 5.000000
merchantratingscount meanproductprices meanretailprices \
count 9.580000e+02 958.000000 958.000000
mean 2.202045e+04 8.634906 24.802265
std 8.473232e+04 3.987599 30.258719
min 0.000000e+00 1.000000 1.000000
25% 1.373000e+03 6.000000 7.000000
50% 5.990500e+03 8.000000 11.000000
75% 1.723850e+04 11.000000 28.875000
max 2.174765e+06 49.000000 252.000000
averagediscount meandiscount meanproductratingscount \
count 958.000000 958.000000 958.000000
mean 28.721294 28.698017 922.538622
std 39.918708 39.914269 1925.750178
min -18.000000 -18.000000 0.000000
25% -10.000000 -10.000000 31.000000
50% 16.000000 16.000000 210.500000
75% 71.000000 71.000000 920.750000
max 97.000000 97.000000 20744.000000
totalurgencycount urgencytextrate
count 391.000000 391.000000
mean 1.209719 65.572890
std 0.583542 28.476426
min 1.000000 14.000000
25% 1.000000 50.000000
50% 1.000000 50.000000
75% 1.000000 100.000000
max 6.000000 100.000000
In [4]:
import matplotlib.pyplot as plt
# Histogram of units sold per merchant, split into 20 bins.
fig, ax = plt.subplots()
ax.hist(df['totalunitssold'], bins=20, color='skyblue', edgecolor='black')
ax.set_xlabel('Total Units Sold')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Total Units Sold')
plt.show()
In [5]:
# One green marker per row: units sold on x, merchant rating on y.
fig, ax = plt.subplots()
ax.scatter(df['totalunitssold'], df['rating'], color='green')
ax.set_xlabel('Total Units Sold')
ax.set_ylabel('Rating')
ax.set_title('Relationship between Total Units Sold and Rating')
plt.show()
In [6]:
# Rank merchants by their listing count before plotting. Slicing the first ten
# rows only reflects the order of the CSV file, which is not ordered by
# 'listedproducts', so it would not show the merchants the title promises.
top_by_listings = df.nlargest(10, 'listedproducts')

fig, ax = plt.subplots()
ax.bar(top_by_listings['merchantid'], top_by_listings['listedproducts'], color='orange')
ax.set_xlabel('Merchant ID')
ax.set_ylabel('Listed Products')
ax.set_title('Top 10 Merchants by Listed Products')
# Merchant ids are long hex strings; tilt them so neighbouring labels stay legible.
ax.tick_params(axis='x', labelrotation=45)
plt.show()
In [7]:
# Horizontal box plot summarising the spread of mean units sold per product.
fig, ax = plt.subplots()
ax.boxplot(df['meanunitssoldperproduct'], vert=False)
ax.set_xlabel('Mean Units Sold per Product')
ax.set_title('Box Plot of Mean Units Sold per Product')
plt.show()
In [8]:
# Mean product price drawn against the DataFrame's row index (the x values are
# row positions, not a time axis).
fig, ax = plt.subplots()
ax.plot(df['meanproductprices'], color='red')
ax.set_xlabel('Index')
ax.set_ylabel('Mean Product Prices')
ax.set_title('Trend of Mean Product Prices')
plt.show()
In [9]:
# Pie chart of listing counts for the first five rows of the frame.
# NOTE(review): "top 5" here means the first five rows in file order; the
# frame is not re-ranked in this cell - confirm that ordering is intended.
first_five = df.iloc[:5]
fig, ax = plt.subplots()
ax.pie(
    first_five['listedproducts'],
    labels=first_five['merchantid'],
    autopct='%1.1f%%',
    startangle=140,
)
# Equal aspect ratio keeps the pie circular.
ax.axis('equal')
ax.set_title('Proportion of Listed Products by Top 5 Merchants')
plt.show()
In [12]:
import plotly.express as px
# 'listedproducts' and 'totalunitssold' are counts, not coordinates, and the
# dataset has no latitude/longitude columns, so a geographic scatter cannot
# represent them. Plot them on ordinary Cartesian axes instead.
fig = px.scatter(
    df,
    x='listedproducts',
    y='totalunitssold',
    hover_name='merchantid',
    # Units sold span several orders of magnitude; a log axis keeps the
    # low-volume merchants distinguishable.
    log_y=True,
    labels={'listedproducts': 'Listed Products', 'totalunitssold': 'Total Units Sold'},
    title='Total Units Sold vs. Listed Products',
)
# Plotly figures support zoom and pan out of the box, so no extra layout
# configuration is needed for that.
fig.show()
In [13]:
import pandas as pd
import plotly.graph_objects as go
# Read the CSV file
csv_file_path = r"C:\Users\User\Desktop\Data Analytics\activesellers.csv"
df = pd.read_csv(csv_file_path)

# Build a 3D scatter plot (one marker per merchant). The trace is a Scatter3d,
# so the figure is titled as a scatter plot rather than a bar graph.
fig = go.Figure(data=[
    go.Scatter3d(
        x=df['listedproducts'],
        y=df['totalunitssold'],
        z=df['rating'],
        mode='markers',
        marker=dict(
            size=12,
            color=df['rating'],            # colour encodes the same value as the z axis
            colorscale='Viridis',
            opacity=0.8,
            colorbar=dict(title='Rating'),
        ),
        text=df['merchantid'],
        hoverinfo='text',                  # hover shows only the merchant id
    )
])

# Titles, axis labels and the initial camera placement.
fig.update_layout(
    title='Interactive 3D Scatter Plot',
    scene=dict(
        xaxis=dict(title='Listed Products'),
        yaxis=dict(title='Total Units Sold'),
        zaxis=dict(title='Rating'),
        camera=dict(
            eye=dict(x=1.2, y=1.2, z=1.2),   # initial camera position
            center=dict(x=0, y=0, z=0),      # point the camera looks at
            up=dict(x=0, y=0, z=1),          # z axis points up
        ),
    ),
)

# Show the plot
fig.show()
In [14]:
import pandas as pd
import plotly.graph_objects as go
# Read the CSV file
csv_file_path = r"C:\Users\User\Desktop\Data Analytics\activesellers.csv"
df = pd.read_csv(csv_file_path)
# Display the columns in the DataFrame
print(df.columns)

# The x values are 'totalurgencycount', which is missing for most rows and
# takes only a handful of distinct values. Connecting the raw rows with lines
# zig-zags back and forth across those few x positions, so average each metric
# per urgency count first. groupby() drops the NaN keys and returns the groups
# in ascending order, so each line runs left to right.
urgency_means = df.groupby('totalurgencycount')[
    ['listedproducts', 'totalunitssold', 'rating']
].mean()

# Create a line chart using Plotly: one line per averaged metric.
fig = go.Figure()
for column, label in [
    ('listedproducts', 'Mean Listed Products'),
    ('totalunitssold', 'Mean Total Units Sold'),
    ('rating', 'Mean Rating'),
]:
    fig.add_trace(go.Scatter(
        x=urgency_means.index,
        y=urgency_means[column],
        mode='lines+markers',
        name=label,
    ))

fig.update_layout(
    title='Zoomable Line Chart',
    # The x axis carries the urgency count (the dataset has no date column);
    # the range slider allows zooming along it.
    xaxis=dict(title='Total Urgency Count', rangeslider=dict(visible=True)),
    # The three metrics differ by orders of magnitude; a log axis keeps the
    # small ones from collapsing onto the baseline.
    yaxis=dict(title='Mean Value', type='log'),
    hovermode='x'  # hover shows every trace's value at the hovered x position
)
# Show the plot
fig.show()
Index(['merchantid', 'listedproducts', 'totalunitssold',
'meanunitssoldperproduct', 'rating', 'merchantratingscount',
'meanproductprices', 'meanretailprices', 'averagediscount',
'meandiscount', 'meanproductratingscount', 'totalurgencycount',
'urgencytextrate'],
dtype='object')
In [15]:
import pandas as pd
import plotly.graph_objects as go
# Read the CSV file
csv_file_path = r"C:\Users\User\Desktop\Data Analytics\activesellers.csv"
df = pd.read_csv(csv_file_path)
# Display the columns in the DataFrame
print(df.columns)

# The x values are 'totalurgencycount', not merchant ids. Plotting raw rows
# stacks hundreds of bars on the same few x positions, so average each metric
# per urgency count and draw one bar per group. groupby() drops the rows whose
# urgency count is missing.
urgency_means = df.groupby('totalurgencycount')[
    ['listedproducts', 'totalunitssold', 'rating']
].mean()

# Create a column chart using Plotly: one bar series per averaged metric.
fig = go.Figure()
for column, label in [
    ('listedproducts', 'Mean Listed Products'),
    ('totalunitssold', 'Mean Total Units Sold'),
    ('rating', 'Mean Rating'),
]:
    fig.add_trace(go.Bar(x=urgency_means.index, y=urgency_means[column], name=label))

fig.update_layout(
    title='Column Chart',
    xaxis=dict(title='Total Urgency Count'),
    # The three metrics differ by orders of magnitude; a log axis keeps the
    # small bars visible next to units sold.
    yaxis=dict(title='Mean Value', type='log'),
    barmode='group'  # side-by-side bars; use 'stack' for stacked bars
)
# Show the plot
fig.show()
Index(['merchantid', 'listedproducts', 'totalunitssold',
'meanunitssoldperproduct', 'rating', 'merchantratingscount',
'meanproductprices', 'meanretailprices', 'averagediscount',
'meandiscount', 'meanproductratingscount', 'totalurgencycount',
'urgencytextrate'],
dtype='object')
In [16]:
import pandas as pd
import plotly.graph_objects as go
# Reload the sellers data and echo its column names.
csv_file_path = r"C:\Users\User\Desktop\Data Analytics\activesellers.csv"
df = pd.read_csv(csv_file_path)
print(df.columns)

# Flat treemap: one tile per row, sized by that row's listing count.
root_parents = ["" for _ in range(len(df))]   # "" places every tile directly under the root
merchant_tiles = go.Treemap(
    labels=df['merchantid'],
    parents=root_parents,
    values=df['listedproducts'],
    textinfo="label+value",                   # text drawn on each tile: its label and value
)
fig = go.Figure(merchant_tiles)

# Add the chart title.
fig.update_layout(title='Treemap Chart')

# Render the figure.
fig.show()
Index(['merchantid', 'listedproducts', 'totalunitssold',
'meanunitssoldperproduct', 'rating', 'merchantratingscount',
'meanproductprices', 'meanretailprices', 'averagediscount',
'meandiscount', 'meanproductratingscount', 'totalurgencycount',
'urgencytextrate'],
dtype='object')
In [17]:
import pandas as pd
import plotly.graph_objects as go
# Reload the sellers data.
csv_file_path = r"C:\Users\User\Desktop\Data Analytics\activesellers.csv"
df = pd.read_csv(csv_file_path)

# Order merchants from highest to lowest units sold, then express the running
# total as a percentage of all units sold.
df_sorted = df.sort_values(by='totalunitssold', ascending=False)
units_running_total = df_sorted['totalunitssold'].cumsum()
units_grand_total = df_sorted['totalunitssold'].sum()
df_sorted['cumulative_percentage'] = (units_running_total / units_grand_total) * 100

# Bars: units sold per merchant (left axis).
units_bars = go.Bar(
    x=df_sorted['merchantid'],
    y=df_sorted['totalunitssold'],
    name='Total Units Sold',
    marker=dict(color='blue'),
)
# Line: cumulative share, drawn against the secondary (right) axis.
cumulative_line = go.Scatter(
    x=df_sorted['merchantid'],
    y=df_sorted['cumulative_percentage'],
    name='Cumulative Percentage',
    yaxis='y2',
    line=dict(color='red', width=4),
)

# Assemble the Pareto chart from the two traces.
fig = go.Figure(data=[units_bars, cumulative_line])

# Titles plus the two y axes; y2 overlays y and is fixed to 0-100 percent.
fig.update_layout(
    title='Pareto Chart',
    xaxis=dict(title='Merchant ID'),
    yaxis=dict(title='Total Units Sold', side='left', color='blue'),
    yaxis2=dict(
        title='Cumulative Percentage',
        overlaying='y',
        side='right',
        color='red',
        range=[0, 100],
    ),
)

# Render the figure.
fig.show()
In [18]:
import pandas as pd
import plotly.graph_objects as go
# Illustrative sample figures (not taken from the sellers CSV). They live in
# their own frame so the sellers DataFrame `df` is not overwritten.
# 'measure' tells Plotly how to read each row:
#   absolute - the bar starts the running total at this value
#   relative - the value is a delta added to the running total
#   total    - the bar shows the running total so far (its y value is ignored)
waterfall_df = pd.DataFrame({
    'category': ['Starting', 'Sales', 'Refunds', 'Net Sales', 'Expenses', 'Profit'],
    'amount': [100000, -30000, 5000, None, -20000, None],
    'measure': ['absolute', 'relative', 'relative', 'total', 'relative', 'total'],
})

# Create the waterfall chart.
# go.Waterfall accumulates the y values itself, so it must receive the raw
# per-step amounts; feeding it a pre-computed cumulative sum would count every
# step twice. Rows without an amount are the 'total' rows; they are filled with
# 0 because Plotly computes those bars from the running total.
fig = go.Figure(go.Waterfall(
    name='20', orientation='v',
    measure=waterfall_df['measure'],
    x=waterfall_df['category'],
    textposition='outside',
    text=waterfall_df['category'],
    y=waterfall_df['amount'].fillna(0),
    connector={'line': {'color': 'rgb(63, 63, 63)'}},
    decreasing={'marker': {'color': 'red'}},
    increasing={'marker': {'color': 'green'}},
    # Applied only to rows whose measure is 'total'.
    totals={'marker': {'color': 'blue', 'line': {'color': 'blue', 'width': 3}}},
))

# Update layout
fig.update_layout(
    title='Waterfall Chart',
    showlegend=False
)

# Show the plot
fig.show()
In [26]:
import pandas as pd
import plotly.graph_objects as go
In [27]:
import pandas as pd
# Absolute location of the active-sellers CSV export on the author's machine.
csv_file_path = r"C:\Users\User\Desktop\Data Analytics\activesellers.csv"
# Parse the CSV into the DataFrame that the following cells read as `df`.
df = pd.read_csv(filepath_or_buffer=csv_file_path)
In [28]:
# Quick look at the data: first rows, then schema, then summary statistics.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() appends a stray "None" line.
df.info()
print(df.describe())
merchantid listedproducts totalunitssold \
0 5357bcf2bb72c5504882e889 2 120000
1 5708773c3c02161b3f8c7900 5 107100
2 5417aada4ad3ab27e954b76c 2 100007
3 570f3a713a698c14278bb51e 1 100000
4 53082ea15aefb07dfe1f2a4f 1 100000
meanunitssoldperproduct rating merchantratingscount meanproductprices \
0 60000.0 4.219 320031.0 9.00
1 21420.0 3.934 139223.0 7.76
2 50004.0 4.053 108048.0 8.00
3 100000.0 3.889 19248.0 5.67
4 100000.0 4.036 366898.0 5.00
meanretailprices averagediscount meandiscount meanproductratingscount \
0 20.0 54.0 54.0 8836.0
1 34.2 61.0 61.0 4010.0
2 8.0 -1.0 -1.0 5531.0
3 19.0 71.0 71.0 18393.0
4 33.0 85.0 85.0 13789.0
totalurgencycount urgencytextrate
0 1.0 50.0
1 3.0 60.0
2 NaN NaN
3 1.0 100.0
4 NaN NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 merchantid 958 non-null object
1 listedproducts 958 non-null int64
2 totalunitssold 958 non-null int64
3 meanunitssoldperproduct 958 non-null float64
4 rating 958 non-null float64
5 merchantratingscount 958 non-null float64
6 meanproductprices 958 non-null float64
7 meanretailprices 958 non-null float64
8 averagediscount 958 non-null float64
9 meandiscount 958 non-null float64
10 meanproductratingscount 958 non-null float64
11 totalurgencycount 391 non-null float64
12 urgencytextrate 391 non-null float64
dtypes: float64(10), int64(2), object(1)
memory usage: 97.4+ KB
None
listedproducts totalunitssold meanunitssoldperproduct rating \
count 958.000000 958.000000 958.000000 958.000000
mean 1.641962 7124.483299 4407.605428 4.043994
std 1.246183 14363.521893 9167.077812 0.222137
min 1.000000 1.000000 1.000000 2.333000
25% 1.000000 100.000000 100.000000 3.931000
50% 1.000000 1000.000000 1000.000000 4.055500
75% 2.000000 10000.000000 5000.000000 4.190000
max 15.000000 120000.000000 100000.000000 5.000000
merchantratingscount meanproductprices meanretailprices \
count 9.580000e+02 958.000000 958.000000
mean 2.202045e+04 8.634906 24.802265
std 8.473232e+04 3.987599 30.258719
min 0.000000e+00 1.000000 1.000000
25% 1.373000e+03 6.000000 7.000000
50% 5.990500e+03 8.000000 11.000000
75% 1.723850e+04 11.000000 28.875000
max 2.174765e+06 49.000000 252.000000
averagediscount meandiscount meanproductratingscount \
count 958.000000 958.000000 958.000000
mean 28.721294 28.698017 922.538622
std 39.918708 39.914269 1925.750178
min -18.000000 -18.000000 0.000000
25% -10.000000 -10.000000 31.000000
50% 16.000000 16.000000 210.500000
75% 71.000000 71.000000 920.750000
max 97.000000 97.000000 20744.000000
totalurgencycount urgencytextrate
count 391.000000 391.000000
mean 1.209719 65.572890
std 0.583542 28.476426
min 1.000000 14.000000
25% 1.000000 50.000000
50% 1.000000 50.000000
75% 1.000000 100.000000
max 6.000000 100.000000
In [29]:
# Box plot of units sold, one box per distinct urgency count.
# This cell needs seaborn (`sns`) and matplotlib.pyplot (`plt`) to be imported
# in the notebook's import cell; otherwise a fresh-kernel run stops here with
# a NameError.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='totalurgencycount', y='totalunitssold', data=df, ax=ax)
ax.set_xlabel('Total Urgency Count')
ax.set_ylabel('Total Units Sold')
ax.set_title('Boxplot of Total Units Sold Grouped by Total Urgency Count')
plt.show()
In [30]:
# Violin plot of units sold, one violin per distinct urgency count.
# This cell needs seaborn (`sns`) and matplotlib.pyplot (`plt`) to be imported
# in the notebook's import cell; otherwise a fresh-kernel run stops here with
# a NameError.
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(x='totalurgencycount', y='totalunitssold', data=df, ax=ax)
ax.set_xlabel('Total Urgency Count')
ax.set_ylabel('Total Units Sold')
ax.set_title('Violin Plot of Total Units Sold Grouped by Total Urgency Count')
plt.show()
In [31]:
# Pairwise scatter plots and per-column distributions for four numeric columns.
# This cell needs seaborn (`sns`) and matplotlib.pyplot (`plt`) to be imported
# in the notebook's import cell.
pairplot_columns = ['listedproducts', 'totalunitssold', 'meanunitssoldperproduct', 'rating']
sns.pairplot(df[pairplot_columns])
# pairplot draws a grid of subplots, so plt.title() would label only the last
# subplot. suptitle() titles the whole figure; y > 1 lifts it clear of the top row.
plt.suptitle('Pairplot of Selected Numerical Variables', y=1.02)
plt.show()
In [32]:
import seaborn as sns
import matplotlib.pyplot as plt
# Keep the numeric columns only, so corr() is not handed the merchant id strings.
numeric_df = df.select_dtypes(include=['number'])

# Annotated heatmap of the pairwise correlation matrix.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()
In [ ]: